{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# DoWhy example on the Lalonde dataset\n",
    "\n",
    "Thanks to [@mizuy](https://github.com/mizuy) for providing this example. Here we use the Lalonde dataset and apply IPW estimator to it. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "R[write to console]: Loading required package: MASS\n",
      "\n",
      "R[write to console]: ## \n",
      "##  Matching (Version 4.9-6, Build Date: 2019-04-07)\n",
      "##  See http://sekhon.berkeley.edu/matching for additional documentation.\n",
      "##  Please cite software as:\n",
      "##   Jasjeet S. Sekhon. 2011. ``Multivariate and Propensity Score Matching\n",
      "##   Software with Automated Balance Optimization: The Matching package for R.''\n",
      "##   Journal of Statistical Software, 42(7): 1-52. \n",
      "##\n",
      "\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "array(['Matching', 'MASS', 'tools', 'stats', 'graphics', 'grDevices',\n",
       "       'utils', 'datasets', 'methods', 'base'], dtype='<U9')"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import os, sys\n",
    "sys.path.append(os.path.abspath(\"../../../\"))\n",
    "\n",
    "import dowhy\n",
    "from dowhy import CausalModel\n",
    "from rpy2.robjects import r as R\n",
    "%load_ext rpy2.ipython\n",
    "\n",
    "#%R install.packages(\"Matching\")\n",
    "%R library(Matching)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Load the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "%R data(lalonde)\n",
    "%R -o lalonde\n",
    "lalonde = lalonde.astype({'treat':'bool'}, copy=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Run DoWhy analysis: model, identify, estimate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:dowhy.causal_model:Causal Graph not provided. DoWhy will construct a graph based on data inputs.\n",
      "INFO:dowhy.causal_graph:If this is observed data (not from a randomized experiment), there might always be missing confounders. Adding a node named \"Unobserved Confounders\" to reflect this.\n",
      "INFO:dowhy.causal_model:Model to find the causal effect of treatment ['treat'] on outcome ['re78']\n",
      "INFO:dowhy.causal_identifier:Common causes of treatment and outcome:['educ', 'nodegr', 'married', 'black', 'U', 'age', 'hisp']\n",
      "WARNING:dowhy.causal_identifier:If this is observed data (not from a randomized experiment), there might always be missing confounders. Causal effect cannot be identified perfectly.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARN: Do you want to continue by ignoring any unobserved confounders? (use proceed_when_unidentifiable=True to disable this prompt) [y/n] y\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:dowhy.causal_identifier:Instrumental variables for treatment and outcome:[]\n",
      "INFO:dowhy.causal_estimator:INFO: Using Propensity Score Weighting Estimator\n",
      "INFO:dowhy.causal_estimator:b: re78~treat+educ+nodegr+married+black+age+hisp\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Causal Estimate is 1614.0090222453164\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/amshar/python-environments/vpy36/lib/python3.6/site-packages/sklearn/utils/validation.py:744: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n",
      "  y = column_or_1d(y, warn=True)\n"
     ]
    }
   ],
   "source": [
    "model=CausalModel(\n",
    "        data = lalonde,\n",
    "        treatment='treat',\n",
    "        outcome='re78',\n",
    "        common_causes='nodegr+black+hisp+age+educ+married'.split('+'))\n",
    "identified_estimand = model.identify_effect()\n",
    "estimate = model.estimate_effect(identified_estimand,\n",
    "        method_name=\"backdoor.propensity_score_weighting\")\n",
    "#print(estimate)\n",
    "print(\"Causal Estimate is \" + str(estimate.value))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Sanity check: compare to manual IPW estimate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Causal Estimate is 1639.7820238870836\n"
     ]
    }
   ],
   "source": [
    "df = model._data\n",
    "ps = df['ps']\n",
    "y = df['re78']\n",
    "z = df['treat']\n",
    "\n",
    "ey1 = z*y/ps / sum(z/ps)\n",
    "ey0 = (1-z)*y/(1-ps) / sum((1-z)/(1-ps))\n",
    "ate = ey1.sum()-ey0.sum()\n",
    "print(\"Causal Estimate is \" + str(ate))\n",
    "\n",
    "# correct -> Causal Estimate is 1634.9868359746906"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
